Set up

suppressPackageStartupMessages({
  library(tidyverse)
})

Directories and File Inputs/Outputs

# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions 
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
results_dir <- file.path(analysis_dir, "results")
input_dir <- file.path(analysis_dir, "input")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv")
tmb_vaf_file <- file.path(results_dir, "tmb_vaf_genomic.tsv")
palette_file <- file.path(root_dir, "figures", "palettes", "oncoprint_color_palette.tsv")

# File path to plot directory
plots_dir <-
  file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

source(paste0(root_dir, "/figures/scripts/theme.R"))

Read in data and process

genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
  left_join(pbta_df, by = c("Kids_First_Participant_ID")) %>% 
  left_join(tmb_vaf_df, by = c("Kids_First_Biospecimen_ID")) %>%
  filter(!is.na(tmb))
Error in `left_join()`:
! Join columns in `x` must be present in the data.
✖ Problem with `Kids_First_Biospecimen_ID`.
Backtrace:
 1. ... %>% filter(!is.na(tmb))
 4. dplyr:::left_join.data.frame(., tmb_vaf_df, by = c("Kids_First_Biospecimen_ID"))

Define parameters for plots

# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) 

# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$Variant_Classification
# Define label for plots
Alteration_type <- df_plot$Variant_Classification

# Define ylim
ylim <- max(df_plot$log10_tmb)

What type of alterations we observe per tumor descriptor?

# Create bxp
print(ggpubr::ggboxplot(df_plot, 
                        x = \tumor_descriptor\, 
                        y = \log10_tmb\, 
                        color = \Variant_Classification\,
                        palette = palette) +
        theme_Publication() + 
        scale_y_continuous(limits = c(0, ylim)) +
        xlab(\Timepoint\) +
        theme(axis.text.x = element_text(angle = 90)))

# Save the plot
ggsave(filename = \Alteration_type_timepoints.pdf\, 
       path = plots_dir, 
       width = 15, 
       height = 8, 
       device = \pdf\, 
       useDingbats = FALSE)

What type of alterations we observe per tumor descriptor in each cancer group?

# Create bxp
print(ggpubr::ggboxplot(df_plot, 
                        x = \tumor_descriptor\, 
                        y = \log10_tmb\, 
                        color = \Variant_Classification\,
                        palette = palette) +
        facet_wrap(~cg_id) +
        theme_Publication() + 
        xlab(\Timepoint\) +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90)))

# Save the plot
ggsave(filename = \Alteration_type_cancer_group.pdf\, 
       path = plots_dir, 
       width = 25, 
       height = 18, 
       device = \pdf\, 
       useDingbats = FALSE)

What type of alterations we observe per tumor descriptor in each cancer group defined by cgGFAC?

df_plot_cgGFAC <- df_plot %>% 
  arrange(tumor_descriptor_cgGFAC_n)
  #mutate(tumor_descriptor_cgGFAC_n = factor(tumor_descriptor_cgGFAC_n)) 

#df_plot_cgGFAC$tumor_descriptor_cgGFAC_n %>% levels()

# Create bxp
print(ggpubr::ggboxplot(df_plot_cgGFAC, 
                        x = \tumor_descriptor_cgGFAC_n\, 
                        y = \log10_tmb\, 
                        color = \Variant_Classification\,
                        palette = palette) +
        theme_Publication() + 
        xlab(\Timepoint\) +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90)))

# Save the plot
ggsave(filename = \Alteration_type_cgGFAC.pdf\, 
       path = plots_dir, 
       width = 14, 
       height = 8, 
       device = \pdf\, 
       useDingbats = FALSE)
cgGFAC_id <- as.character(unique(df_plot_cgGFAC$cgGFAC))
cgGFAC_id
[1] \ATRT\  \DMG\   \HGG\   \LGG\   \Other\
ATRT

DMG

HGG

LGG

Other
# Loop through variable
for (i in seq_along(cgGFAC_id)){
  print(i)
  df_sub <- df_plot_cgGFAC %>%
      filter(cgGFAC == cgGFAC_id[i])

  
   # Create bxp
  print(ggpubr::ggboxplot(df_sub, 
                        x = \tumor_descriptor_cgGFAC_n\, 
                        y = \log10_tmb\, 
                        color = \Variant_Classification\,
                        palette = palette) +
        theme_Publication() + 
        labs(title = paste(cgGFAC_id[i])) +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)))
}
[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

What type of alterations we observe per tumor descriptor in each cancer group (add _n))?

cg <- as.character(unique(df_plot$cg_id))
cg
 [1] \Embryonal tumor with multilayer rosettes\
 [2] \Low-grade glioma\                        
 [3] \Ependymoma\                              
 [4] \Medulloblastoma\                         
 [5] \Atypical Teratoid Rhabdoid Tumor\        
 [6] \Diffuse midline glioma\                  
 [7] \High-grade glioma\                       
 [8] \Ganglioglioma\                           
 [9] \Meningioma\                              
[10] \Pilocytic astrocytoma\                   
[11] \CNS Embryonal tumor\                     
[12] \Neuroblastoma\                           
[13] \Schwannoma\                              
[14] \Chordoma\                                
[15] \Malignant peripheral nerve sheath tumor\ 
[16] \Choroid plexus carcinoma\                
[17] \Adamantinomatous Craniopharyngioma\      
[18] \Dysembryoplastic neuroepithelial tumor\  
[19] \Ewing sarcoma\                           
[20] \Rosai-Dorfman disease\                   
[21] \Neurofibroma/Plexiform\                  
[22] \Glial-neuronal tumor\                    
[23] \Hemangioblastoma\                        
[24] \Craniopharyngioma\                       
Embryonal tumor with multilayer rosettes

Low-grade glioma

Ependymoma

Medulloblastoma

Atypical Teratoid Rhabdoid Tumor

Diffuse midline glioma

High-grade glioma

Ganglioglioma

Meningioma

Pilocytic astrocytoma

CNS Embryonal tumor

Neuroblastoma

Schwannoma

Chordoma

Malignant peripheral nerve sheath tumor

Choroid plexus carcinoma

Adamantinomatous Craniopharyngioma

Dysembryoplastic neuroepithelial tumor

Ewing sarcoma

Rosai-Dorfman disease

Neurofibroma/Plexiform

Glial-neuronal tumor

Hemangioblastoma

Craniopharyngioma
# Loop through variable
for (i in seq_along(cg)){
  print(i)
  df_sub <- df_plot %>%
      filter(cg_id == cg[i])
  
  # Create bxp
  print(ggpubr::ggboxplot(df_sub, 
                        x = \tumor_descriptor_cg_n\, 
                        y = \log10_tmb\, 
                        color = \Variant_Classification\,
                        palette = palette) +
        theme_Publication() + 
        xlab(\Timepoint\) +
        labs(title = paste(cg[i])) +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)))
}
[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] 10

[1] 11

[1] 12

[1] 13

[1] 14

[1] 15

[1] 16

[1] 17

[1] 18

[1] 19

[1] 20

[1] 21

[1] 22

[1] 23

[1] 24

What type of alterations we observe per tumor descriptor in each cancer group and timepoint model?

tm <- as.character(unique(df_plot$timepoints_models))
tm
 [1] \Dx-Rec\         \Dx-Pro\         \Dx-Dec\         \Pro-Dec\       
 [5] \Rec-SM\         \Pro-Rec\        \Dx-SM\          \Pro-Rec-Dec\   
 [9] \Dx-Pro-Rec\     \Rec-Dec\        \Dx-Pro-Rec-Dec\ \Dx-Rec-Dec\    
[13] \Dx-Pro-Dec\    
Dx-Rec

Dx-Pro

Dx-Dec

Pro-Dec

Rec-SM

Pro-Rec

Dx-SM

Pro-Rec-Dec

Dx-Pro-Rec

Rec-Dec

Dx-Pro-Rec-Dec

Dx-Rec-Dec

Dx-Pro-Dec
# Loop through variable
for (i in seq_along(tm)){
  print(i)
  df_sub <- df_plot %>%
      filter(timepoints_models == tm[i])
  
  # Create bxp
  print(ggpubr::ggboxplot(df_sub, 
                        x = \tumor_descriptor\, 
                        y = \tmb\, 
                        color = \Variant_Classification\,
                        palette = palette) +
        facet_wrap(~cancer_group) +
        theme_Publication() +
        ylab(\TMB\) +
        xlab(\Timepoint\) +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90)))
}
[1] 1

[1] 2

[1] 3

[1] 4

[1] 5

[1] 6

[1] 7

[1] 8

[1] 9

[1] 10

[1] 11

[1] 12

[1] 13

sessionInfo()
---
title: "Classification of Variants across paired longitudinal samples in the PBTA Cohort"
author: 'Antonia Chroni <chronia@chop.edu> for D3B'
date: "2023"
output:
  html_notebook:
    toc: TRUE
    toc_float: TRUE
---

# Set up
```{r load-library}
suppressPackageStartupMessages({
  library(tidyverse)
})
```

# Directories and File Inputs/Outputs
```{r set-dir-and-file-names}
# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions 
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
results_dir <- file.path(analysis_dir, "results")
input_dir <- file.path(analysis_dir, "input")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv")
tmb_vaf_file <- file.path(results_dir, "tmb_vaf_genomic.tsv")
palette_file <- file.path(root_dir, "figures", "palettes", "oncoprint_color_palette.tsv")

# File path to plot directory
plots_dir <-
  file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

source(paste0(root_dir, "/figures/scripts/theme.R"))
```

# Read in data and process

```{r load-process-inputs}
pbta_df <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>% 
  select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_multiple, cg_id, cgGFAC)

tmb_vaf_df <- readr::read_tsv(tmb_vaf_file, guess_max = 100000, show_col_types = FALSE) %>% 
  filter(!tmb >= 10) %>% 
  select(Kids_First_Biospecimen_ID, Variant_Classification, gene_protein, mutation_count,	region_size, tmb, VAF)

genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
  left_join(pbta_df, by = c("Kids_First_Participant_ID")) %>% 
  left_join(tmb_vaf_df, by = c("Kids_First_Biospecimen_ID")) %>%
  filter(!is.na(tmb))

# Attention as some bs specimen might not have TMB!
# If that happens, we will end up with samples lacking timepoints.

# Which patient samples don't have TMB?
# genomic_paired_df %>% 
#  filter(is.na(tmb)) %>% 
#  unique() %>% 
#  regulartable() %>%
#  fontsize(size = 12, part = "all")

descriptors_df <- genomic_paired_df %>%
  group_by(Kids_First_Participant_ID) %>%
  summarize(descriptors = paste(sort(tumor_descriptor), collapse = ", "),) 

# Vector to order timepoints
timepoints <- c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable")

df <- genomic_paired_df %>% 
  left_join(descriptors_df, by = c("Kids_First_Participant_ID")) %>% 
  mutate(td_cgGFAC = case_when(grepl("Deceased", tumor_descriptor) ~ "xDeceased",
                               TRUE ~ tumor_descriptor),
         log10_tmb = abs(log10(tmb)))

# Let's count #samples per cancer groups and timepoints.
# We will use the cg_id col that indicates cancer type as identified at the first diagnostic sample
timepoint_cg_n_df <- df %>% 
  count(cg_id, tumor_descriptor) %>% 
  dplyr::mutate(tumor_descriptor_cg_n = glue::glue("{cg_id}_{tumor_descriptor}  (N={n})")) %>% 
  dplyr::rename(timepoint_cg_n = n) 

# Let's count #samples per cancer groups and timepoints 
timepoint_cgGFAC_n_df <- df %>% 
  count(cgGFAC, td_cgGFAC) %>% 
  dplyr::mutate(tumor_descriptor_cgGFAC_n = glue::glue("{cgGFAC}_{td_cgGFAC}  (N={n})")) %>% 
  dplyr::rename(timepoint_cgGFAC_n = n) 

# Create df to use for plots
df_plot <- df %>% 
  left_join(timepoint_cg_n_df, by = c("tumor_descriptor", "cg_id")) %>%
  left_join(timepoint_cgGFAC_n_df, by = c("td_cgGFAC", "cgGFAC")) %>% 
  filter(!timepoint_cg_n <= 2,
         !timepoint_cgGFAC_n <= 2,
         !cg_id == "NA") %>% 
  mutate(tumor_descriptor = factor(tumor_descriptor),
         tumor_descriptor = fct_relevel(tumor_descriptor, timepoints))
``` 

# Define parameters for plots

```{r define-parameters-for-plots}
# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) 

# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$Variant_Classification

# Define label for plots
Alteration_type <- df_plot$Variant_Classification

# Define ylim
ylim <- max(df_plot$log10_tmb)
```

# What type of alterations we observe per tumor descriptor?

```{r plot-timepoint, fig.width = 15, fig.height = 8, fig.fullwidth = TRUE}
# Create bxp
print(ggpubr::ggboxplot(df_plot, 
                        x = "tumor_descriptor", 
                        y = "log10_tmb", 
                        color = "Variant_Classification",
                        palette = palette) +
        theme_Publication() + 
        scale_y_continuous(limits = c(0, ylim)) +
        xlab("Timepoint") +
        theme(axis.text.x = element_text(angle = 90)))

# Save the plot
ggsave(filename = "Alteration_type_timepoints.pdf", 
       path = plots_dir, 
       width = 15, 
       height = 8, 
       device = "pdf", 
       useDingbats = FALSE)
```


# What type of alterations we observe per tumor descriptor in each cancer group?

```{r plot-cg-id, fig.width = 25, fig.height = 18, fig.fullwidth = TRUE}
# Create bxp
print(ggpubr::ggboxplot(df_plot, 
                        x = "tumor_descriptor", 
                        y = "log10_tmb", 
                        color = "Variant_Classification",
                        palette = palette) +
        facet_wrap(~cg_id) +
        theme_Publication() + 
        xlab("Timepoint") +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90)))

# Save the plot
ggsave(filename = "Alteration_type_cancer_group.pdf", 
       path = plots_dir, 
       width = 25, 
       height = 18, 
       device = "pdf", 
       useDingbats = FALSE)
```


# What type of alterations we observe per tumor descriptor in each cancer group defined by cgGFAC?

```{r plot-cgGFAC-n, fig.width = 14, fig.height = 8, fig.fullwidth = TRUE}
df_plot_cgGFAC <- df_plot %>% 
  arrange(tumor_descriptor_cgGFAC_n)

# Create bxp
print(ggpubr::ggboxplot(df_plot_cgGFAC, 
                        x = "tumor_descriptor_cgGFAC_n", 
                        y = "log10_tmb", 
                        color = "Variant_Classification",
                        palette = palette) +
        theme_Publication() + 
        xlab("Timepoint") +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90)))

# Save the plot
ggsave(filename = "Alteration_type_cgGFAC.pdf", 
       path = plots_dir, 
       width = 14, 
       height = 8, 
       device = "pdf", 
       useDingbats = FALSE)

```


```{r plot-cgGFAC-n-individual-plots, fig.width = 8, fig.height = 6, fig.fullwidth = TRUE}
cgGFAC_id <- as.character(unique(df_plot_cgGFAC$cgGFAC))
cgGFAC_id

# Loop through variable
for (i in seq_along(cgGFAC_id)){
  print(i)
  df_sub <- df_plot_cgGFAC %>%
      filter(cgGFAC == cgGFAC_id[i])

  
   # Create bxp
  print(ggpubr::ggboxplot(df_sub, 
                        x = "tumor_descriptor_cgGFAC_n", 
                        y = "log10_tmb", 
                        color = "Variant_Classification",
                        palette = palette) +
        theme_Publication() + 
        labs(title = paste(cgGFAC_id[i])) +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)))
}
```

# What type of alterations we observe per tumor descriptor in each cancer group (add _n))?
 
```{r plot-n, fig.width = 12, fig.height = 8, fig.fullwidth = TRUE}
cg <- as.character(unique(df_plot$cg_id))
cg

# Loop through variable
for (i in seq_along(cg)){
  print(i)
  df_sub <- df_plot %>%
      filter(cg_id == cg[i])
  
  # Create bxp
  print(ggpubr::ggboxplot(df_sub, 
                        x = "tumor_descriptor_cg_n", 
                        y = "log10_tmb", 
                        color = "Variant_Classification",
                        palette = palette) +
        theme_Publication() + 
        xlab("Timepoint") +
        labs(title = paste(cg[i])) +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)))
}
```


# What type of alterations we observe per tumor descriptor in each cancer group and timepoint model?

```{r plot-timepoint-model, fig.width = 25, fig.height = 18, fig.fullwidth = TRUE}
tm <- as.character(unique(df_plot$timepoints_models))
tm

# Loop through variable
for (i in seq_along(tm)){
  print(i)
  df_sub <- df_plot %>%
      filter(timepoints_models == tm[i])
  
  # Create bxp
  print(ggpubr::ggboxplot(df_sub, 
                        x = "tumor_descriptor", 
                        y = "tmb", 
                        color = "Variant_Classification",
                        palette = palette) +
        facet_wrap(~cancer_group) +
        theme_Publication() +
        ylab("TMB") +
        xlab("Timepoint") +
        scale_y_continuous(limits = c(0, ylim)) +
        theme(axis.text.x = element_text(angle = 90)))
}
```


```{r echo=TRUE}
sessionInfo()
```
